home *** CD-ROM | disk | FTP | other *** search
Text File | 1998-04-16 | 1.7 KB | 76 lines | [TEXT/CWIE] |
- // JapaneseTokenizer.h
- // Copyright: © 1996 - 1998 by Apple Computer, Inc., all rights reserved.
-
- // This tokenizer breaks on changes from one writing system to another. In
- // 1-byte text, it breaks on most non-alphanumeric characters, but allows
- // embedded periods and commas because they might be decimal points.
-
- #pragma once
- #ifndef JapaneseTokenizer_h
- #define JapaneseTokenizer_h
-
- #pragma import on
- #if PRAGMA_STRUCT_ALIGN
- #pragma options align=power
- #endif
-
- #include <ctype.h>
- #include <Script.h> // Macintosh platform specific include
-
- #include "IAAnalysis.h"
- #include "IACharStream.h"
-
- #pragma IA_BEGIN_EXPORTS
-
- // DoubleByteChar: a reading cursor over an IACharStream.  The public fields
- // describe the current character.  GetNextDBChar() and EndOfFile() are only
- // declared here; their definitions live outside this header.
- class DoubleByteChar
- {
- public:
-     UInt16          code;           // current character value -- presumably the 1- or 2-byte code; confirm in GetNextDBChar()
-     UInt8           len;            // presumably the byte length of the current character -- TODO confirm
-     UInt8           kind;           // presumably the character class / writing system -- TODO confirm
-     IACharStream*   fCharStream;    // stream being read; not deleted by this class (destructor is empty)
-     bool            ungetFlag;      // if true then do NOT getchar()
-     unsigned long   fCurrentPos;    // position in stream
-
-     // Binds the cursor to `stream` and zeroes every other field.  The
-     // initializers are listed in member declaration order, which is the
-     // order in which C++ actually runs them.
-     DoubleByteChar(IACharStream* stream) :
-         code(0), len(0), kind(0), fCharStream(stream),
-         ungetFlag(false), fCurrentPos(0) {}
-     ~DoubleByteChar() {}
-
-     OSErr   GetNextDBChar();
-     short   EndOfFile();
-
-     // Sets ungetFlag; per the field's comment, the next read is then
-     // expected to skip fetching a new character.
-     void    UngetChar() {ungetFlag = true;}
-
-     // Reports the position from the stream itself (fCharStream->CurrentPos()),
-     // not from the fCurrentPos field.
-     UInt32  GetCurrentPos() {return fCharStream->CurrentPos();}
- private:
-     // True when c may appear inside a 1-byte token: an alphanumeric, or a
-     // period/comma (kept because they might be decimal points).
-     //
-     // The punctuation is compared directly instead of via strchr(".,", c):
-     // strchr also matches the string's terminating NUL, so c == 0 used to be
-     // accepted as a token character.  Values outside 0..255 are rejected
-     // before isalnum() is called, because passing it anything other than EOF
-     // or an unsigned-char value is undefined behaviour.
-     // NOTE(review): a sign-extended high-bit byte now yields false; confirm
-     // that callers expect such bytes to be non-token characters.
-     bool IsTokenChar(int c)
-     {
-         if (c == '.' || c == ',')
-             return true;
-         if (c < 0 || c > 0xFF)
-             return false;
-         return isalnum(c) != 0;
-     }
-
- };
-
- // JapaneseTokenizer: the IATokenStream implementation declared by this
- // header.  All member functions are defined outside this file.
- // NOTE(review): the class holds raw pointers and declares no copy
- // constructor or assignment operator; confirm whether IATokenStream already
- // prevents copying, since the destructor deletes fCharStream.
- class JapaneseTokenizer : public IATokenStream
- {
- public:
-     // Presumably stores `stream` in fCharStream -- verify in the implementation.
-     JapaneseTokenizer(IACharStream* stream);
-
-     // Deletes fCharStream.
-     ~JapaneseTokenizer();
-
-     // Returns the next token from the stream.
-     IAToken*    GetNextToken();
-
-     // Presumably copies the text between startPos and endPos into `buffer`;
-     // whether endPos is inclusive is not visible here -- TODO confirm.
-     void        GetTextSpan(byte* buffer, uint32 startPos, uint32 endPos);
-
- private:
-     IAToken*    GetNextTokenInternal();
-
-     // Presumably fills `token` and reports its start/end positions through
-     // the two out-pointers; the meaning of the bool result is not visible
-     // here -- TODO confirm.
-     bool        BuildNextToken(StringPtr token, UInt32* startPos, UInt32* endPos);
-
- private:
-     IACharStream*   fCharStream;    // deleted by the destructor
-     DoubleByteChar* fDBChar;        // presumably the character reader over fCharStream -- TODO confirm
-     char*           buffer;         // note: GetTextSpan's parameter of the same name hides this member
- };
-
- #pragma IA_END_EXPORTS
-
- #if PRAGMA_STRUCT_ALIGN
- #pragma options align=reset
- #endif
-
- #pragma import reset
-
-
- #endif
-